In [47]:
import pandas as pd
import numpy as np
import json
import os
import nltk
from nltk.classify import textcat
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.express as px
import wordcloud
from nltk.corpus import stopwords
from nltk import FreqDist
from wordcloud import wordcloud,STOPWORDS
from nltk.sentiment import SentimentIntensityAnalyzer
In [2]:
# Functions

def null_percentage(df):
    return round((df.isnull().sum() / len(df)) * 100,2)

def jason_to_dataframe(file_name):
    with open(file_name,"r",encoding="utf-8") as data_file:
        data = [json.loads(line) for line in data_file]
    return pd.DataFrame(data)
In [3]:
os.chdir("D:\\Data\\yelp_dataset")
os.listdir()
Out[3]:
['Dataset_User_Agreement.pdf',
 'yelp_academic_dataset_business.json',
 'yelp_academic_dataset_checkin.json',
 'yelp_academic_dataset_review.json',
 'yelp_academic_dataset_tip.json',
 'yelp_academic_dataset_user.json']
In [4]:
business_df = jason_to_dataframe("yelp_academic_dataset_business.json")
In [5]:
review_df = jason_to_dataframe("yelp_academic_dataset_review.json")
In [6]:
review_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6990280 entries, 0 to 6990279
Data columns (total 9 columns):
 #   Column       Dtype  
---  ------       -----  
 0   review_id    object 
 1   user_id      object 
 2   business_id  object 
 3   stars        float64
 4   useful       int64  
 5   funny        int64  
 6   cool         int64  
 7   text         object 
 8   date         object 
dtypes: float64(1), int64(3), object(5)
memory usage: 480.0+ MB
In [7]:
business_df.info()
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150346 entries, 0 to 150345
Data columns (total 14 columns):
 #   Column        Non-Null Count   Dtype  
---  ------        --------------   -----  
 0   business_id   150346 non-null  object 
 1   name          150346 non-null  object 
 2   address       150346 non-null  object 
 3   city          150346 non-null  object 
 4   state         150346 non-null  object 
 5   postal_code   150346 non-null  object 
 6   latitude      150346 non-null  float64
 7   longitude     150346 non-null  float64
 8   stars         150346 non-null  float64
 9   review_count  150346 non-null  int64  
 10  is_open       150346 non-null  int64  
 11  attributes    136602 non-null  object 
 12  categories    150243 non-null  object 
 13  hours         127123 non-null  object 
dtypes: float64(3), int64(2), object(9)
memory usage: 16.1+ MB
In [8]:
cls = textcat.TextCat()

review_df['text'][1:10].apply(lambda x : cls.guess_language(x))
Out[8]:
1    eng
2    eng
3    eng
4    eng
5    eng
6    eng
7    eng
8    eng
9    eng
Name: text, dtype: object
In [9]:
categories = business_df['categories'].str.lower().str.split(",",expand=True).stack().to_frame().reset_index()
In [10]:
categories = categories.drop(['level_0','level_1'],axis=1)
In [11]:
categories = categories.rename(columns={0:'Cat_Name'})
In [12]:
categories['Cat_Name'] = categories['Cat_Name'].str.strip()
In [13]:
categories = categories.value_counts().reset_index(name='count')
In [14]:
categories = categories.head(10)
In [15]:
plt.figure(figsize=(15,8))
sns.barplot(x='Cat_Name',y='count',data = categories,palette='rocket')
plt.xlabel("Name of the Business Category")
plt.ylabel("Count")
plt.title("Top 10 Categories Of Business")
plt.show()
In [16]:
plt.figure(figsize=(15,8))
city = business_df['city'].str.lower().value_counts()[:10].reset_index(name="count").rename(columns={'index' : 'City'})
sns.barplot(x='City',y='count',data=city,palette='rocket')
plt.ylabel("Count")
plt.xlabel("City Name")
plt.title("Top Ten Cities with the most business parties in Yelp")
plt.show()
In [17]:
philadelphia_business = business_df[business_df['city'].str.lower() == 'philadelphia']
In [18]:
color_scale = [(0, 'orange'), (1,'red')]
In [19]:
fig = px.scatter_mapbox(philadelphia_business, 
                        lat="latitude", 
                        lon="longitude", 
                        hover_name="name", 
                        hover_data=["name", "review_count","stars"],
                        color="review_count",
                        color_continuous_scale=color_scale,
                        size="stars",
                        zoom=8, 
                        height=800,
                        width=800)

fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()
In [20]:
top_10_business_review = review_df[review_df['stars'] == 5]['business_id'].value_counts().to_frame().reset_index().rename(columns={'business_id' : 'Count','index' : 'business_id'})[:10]
In [21]:
top_10_business_review = pd.merge(top_10_business_review,business_df,on='business_id')[['Count','name']]
In [22]:
plt.figure(figsize=(25,8))
sns.barplot(x='name',y='Count',data=top_10_business_review,palette='rocket')
plt.ylabel("Count")
plt.xlabel("Business Name")
plt.title("Name of the business and Count")
plt.show()
In [23]:
business_df[business_df['name'] == 'Reading Terminal Market'][['name','city','state','categories','postal_code']]
Out[23]:
name city state categories postal_code
143157 Reading Terminal Market Philadelphia PA Candy Stores, Shopping, Department Stores, Fas... 19107
In [24]:
Reading_Terminal_Market_review = review_df[review_df['business_id'] == 'ytynqOUb3hjKeJfRj5Tshw']
In [25]:
Reading_Terminal_Market_review_useful = Reading_Terminal_Market_review['useful'].value_counts().head(10).to_frame().reset_index().rename(columns={'useful' : 'Count','index' : 'useful'})
In [26]:
plt.figure(figsize=(15,5))
sns.barplot(x='useful',y='Count',data=Reading_Terminal_Market_review_useful,palette='rocket')
plt.ylabel("Count")
plt.xlabel("Useful Review")
plt.title("Useful Review and Count")
plt.show()
In [27]:
Reading_Terminal_Market_review_funny = Reading_Terminal_Market_review['funny'].value_counts().head(10).to_frame().reset_index().rename(columns={'funny' : 'Count','index' : 'funny'})
In [28]:
plt.figure(figsize=(15,5))
sns.barplot(x='funny',y='Count',data=Reading_Terminal_Market_review_funny,palette='rocket')
plt.ylabel("Count")
plt.xlabel("funny Review")
plt.title("funny Review and Count")
plt.show()
In [29]:
Reading_Terminal_Market_review_cool = Reading_Terminal_Market_review['cool'].value_counts().head(10).to_frame().reset_index().rename(columns={'cool' : 'Count','index' : 'cool'})
In [30]:
plt.figure(figsize=(15,5))
sns.barplot(x='cool',y='Count',data=Reading_Terminal_Market_review_cool,palette='rocket')
plt.ylabel("Count")
plt.xlabel("cool Review")
plt.title("cool Review and Count")
plt.show()
In [31]:
text = str(Reading_Terminal_Market_review['text'].values)
In [38]:
tokenizer = nltk.RegexpTokenizer(r"\w+")
rp_word = tokenizer.tokenize(text) 
In [39]:
filtered_words = [word for word in rp_word if word not in stopwords.words('english')]
In [45]:
fdist_words = FreqDist(filtered_words)
plt.figure(figsize=(20,5))
plt.scatter(*zip(*fdist_words.most_common(20)))
plt.show()
In [46]:
stop_w = STOPWORDS
wordcloud.WordCloud(background_color='white',stopwords=stop_w).generate(text).to_image()
Out[46]:
In [50]:
sia = SentimentIntensityAnalyzer()
In [59]:
sia.polarity_scores("I Hate You!!!")
Out[59]:
{'neg': 0.821, 'neu': 0.179, 'pos': 0.0, 'compound': -0.6784}
In [62]:
Reading_Terminal_Market_review['Negative'] = Reading_Terminal_Market_review['text'].apply(lambda x : sia.polarity_scores(x).get('neg')) 
C:\Users\samir\AppData\Local\Temp\ipykernel_8656\818546771.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [63]:
Reading_Terminal_Market_review['Positive'] = Reading_Terminal_Market_review['text'].apply(lambda x : sia.polarity_scores(x).get('pos'))
C:\Users\samir\AppData\Local\Temp\ipykernel_8656\4244575143.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [64]:
Reading_Terminal_Market_review['Compound'] = Reading_Terminal_Market_review['text'].apply(lambda x : sia.polarity_scores(x).get('compound'))
C:\Users\samir\AppData\Local\Temp\ipykernel_8656\699295148.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [66]:
def polarity_score(compound):
    if compound > 0.05:
        return "Positive"
    elif compound < -0.05:
        return "Negative"
    elif compound >= -0.05 and compound < 0.05:
        return "Neutral"
In [68]:
Reading_Terminal_Market_review['Sentiment'] = Reading_Terminal_Market_review['Compound'].apply(lambda x : polarity_score(x))
C:\Users\samir\AppData\Local\Temp\ipykernel_8656\700839630.py:1: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

In [69]:
Reading_Terminal_Market_review['Sentiment'].value_counts()
Out[69]:
Positive    5371
Negative     287
Neutral      120
Name: Sentiment, dtype: int64